1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data

trainLabeled <- read.delim("~/GitHub/FCA/Data/trainSet.txt")
validLabeled <- read.delim("~/GitHub/FCA/Data/arcene_valid.txt")
wholeArceneSet <- rbind(trainLabeled,validLabeled)


wholeArceneSet$Labels <-  1*(wholeArceneSet$Labels > 0)
wholeArceneSet[,1:ncol(trainLabeled)] <- sapply(wholeArceneSet,as.double)

1.2.0.1 Standarize the names for the reporting

studyName <- "ARCENE"
dataframe <- wholeArceneSet
outcome <- "Labels"
thro <- 0.8
cexheat = 0.10

TopVariables <- 10

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
200 10000
pander::pander(table(dataframe[,outcome]))
0 1
112 88

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) > 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 7748 , Uni p: 0.01544774 , Uncorrelated Base: 920 , Outcome-Driven Size: 0 , Base Size: 920 
#> 
#> 
 1 <R=1.000,r=0.975,N= 6654>, Top: 373( 47 )...=[ 2 : 373 Fa= 371 : 0.991 ]( 371 , 2281 , 0 ),<|>Tot Used: 2652 , Added: 2281 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,r=0.975,N= 6654>, Top: 875( 22 )........=[ 2 : 875 Fa= 1233 : 0.992 ]( 862 , 2079 , 371 ),<|>Tot Used: 4588 , Added: 2079 , Zero Std: 0 , Max Cor: 1.000
#> 
 3 <R=1.000,r=0.975,N= 6654>, Top: 738( 9 ).......=[ 2 : 738 Fa= 1957 : 0.992 ]( 729 , 1470 , 1233 ),<|>Tot Used: 5682 , Added: 1470 , Zero Std: 0 , Max Cor: 1.000
#> 
 4 <R=1.000,r=0.975,N= 6654>, Top: 422( 17 )....=[ 2 : 422 Fa= 2371 : 0.990 ]( 416 , 767 , 1957 ),<|>Tot Used: 6199 , Added: 767 , Zero Std: 0 , Max Cor: 1.000
#> 
 5 <R=1.000,r=0.975,N= 6654>, Top: 187( 9 ).=[ 2 : 187 Fa= 2557 : 0.989 ]( 186 , 291 , 2371 ),<|>Tot Used: 6387 , Added: 291 , Zero Std: 0 , Max Cor: 0.999
#> 
 6 <R=0.999,r=0.950,N= 3536>, Top: 1248( 7 )............=[ 2 : 1248 Fa= 2945 : 0.970 ]( 1200 , 1565 , 2557 ),<|>Tot Used: 6600 , Added: 1565 , Zero Std: 0 , Max Cor: 0.999
#> 
 7 <R=0.999,r=0.950,N= 3536>, Top: 498( 7 )....[ 1 : 498 Fa= 3060 : 0.950 ]( 487 , 548 , 2945 ),<|>Tot Used: 6666 , Added: 548 , Zero Std: 0 , Max Cor: 0.999
#> 
 8 <R=0.999,r=0.950,N= 3536>, Top: 120( 6 ).[ 1 : 120 Fa= 3087 : 0.950 ]( 119 , 141 , 3060 ),<|>Tot Used: 6687 , Added: 141 , Zero Std: 0 , Max Cor: 0.999
#> 
 9 <R=0.999,r=0.900,N= 2892>, Top: 1117( 1 )..........[ 1 : 1117 Fa= 3332 : 0.923 ]( 1075 , 1330 , 3087 ),<|>Tot Used: 6768 , Added: 1330 , Zero Std: 0 , Max Cor: 0.999
#> 
 10 <R=0.999,r=0.899,N= 2892>, Top: 277( 1 )..=[ 2 : 277 Fa= 3398 : 0.940 ]( 273 , 358 , 3332 ),<|>Tot Used: 6775 , Added: 358 , Zero Std: 0 , Max Cor: 0.998
#> 
 11 <R=0.998,r=0.899,N= 2892>, Top: 74( 2 )[ 1 : 74 Fa= 3417 : 0.899 ]( 72 , 124 , 3398 ),<|>Tot Used: 6780 , Added: 124 , Zero Std: 0 , Max Cor: 0.996
#> 
 12 <R=0.996,r=0.848,N= 2071>, Top: 754( 1 ).......[ 1 : 754 Fa= 3526 : 0.861 ]( 734 , 943 , 3417 ),<|>Tot Used: 6798 , Added: 943 , Zero Std: 0 , Max Cor: 0.995
#> 
 13 <R=0.995,r=0.848,N= 2071>, Top: 234( 1 )..[ 1 : 234 Fa= 3561 : 0.848 ]( 222 , 329 , 3526 ),<|>Tot Used: 6799 , Added: 329 , Zero Std: 0 , Max Cor: 0.995
#> 
 14 <R=0.995,r=0.847,N= 2071>, Top: 59( 1 )[ 1 : 59 Fa= 3564 : 0.847 ]( 58 , 88 , 3561 ),<|>Tot Used: 6803 , Added: 88 , Zero Std: 0 , Max Cor: 0.958
#> 
 15 <R=0.958,r=0.800,N= 1586>, Top: 568( 1 ).....[ 1 : 568 Fa= 3635 : 0.800 ]( 559 , 721 , 3564 ),<|>Tot Used: 6813 , Added: 721 , Zero Std: 0 , Max Cor: 0.988
#> 
 16 <R=0.988,r=0.800,N= 1586>, Top: 155( 1 ).[ 1 : 155 Fa= 3656 : 0.800 ]( 153 , 240 , 3635 ),<|>Tot Used: 6815 , Added: 240 , Zero Std: 0 , Max Cor: 0.965
#> 
 17 <R=0.965,r=0.800,N= 1586>, Top: 45( 1 )[ 1 : 45 Fa= 3661 : 0.800 ]( 45 , 67 , 3656 ),<|>Tot Used: 6815 , Added: 67 , Zero Std: 0 , Max Cor: 0.846
#> 
 18 <R=0.846,r=0.800,N=   15>, Top: 6( 4 )[ 1 : 6 Fa= 3661 : 0.800 ]( 6 , 9 , 3661 ),<|>Tot Used: 6815 , Added: 9 , Zero Std: 0 , Max Cor: 0.891
#> 
 19 <R=0.891,r=0.800,N=   15>, Top: 2( 1 )[ 1 : 2 Fa= 3662 : 0.800 ]( 2 , 2 , 3661 ),<|>Tot Used: 6815 , Added: 2 , Zero Std: 0 , Max Cor: 0.800
#> 
 20 <R=0.800,r=0.800,N=    0>
#> 
 [ 20 ], 0.7999522 Decor Dimension: 6815 Nused: 6815 . Cor to Base: 1356 , ABase: 50 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

63594442

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

6183186

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

3.08

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

1.7

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : V100 200 : V201 300 : V302 400 : V402 500 : V504
600 : V606 700 : V707 800 : V807 900 : V907 1000 : V1008
1100 : V1108 1200 : V1209 1300 : V1309 1400 : V1409 1500 : V1509
1600 : V1610 1700 : V1710 1800 : V1810 1900 : V1911 2000 : V2012
2100 : V2113 2200 : V2213 2300 : V2313 2400 : V2417 2500 : V2518
2600 : V2620 2700 : V2722 2800 : V2822 2900 : V2922 3000 : V3023
3100 : V3123 3200 : V3223 3300 : V3326 3400 : V3428 3500 : V3528
3600 : V3629 3700 : V3734 3800 : V3835 3900 : V3935 4000 : V4038
4100 : V4140 4200 : V4243 4300 : V4344 4400 : V4445 4500 : V4547
4600 : V4649 4700 : V4751 4800 : V4853 4900 : V4954 5000 : V5055
5100 : V5156 5200 : V5256 5300 : V5360 5400 : V5462 5500 : V5564
5600 : V5666 5700 : V5768 5800 : V5868 5900 : V5970 6000 : V6070
6100 : V6171 6200 : V6271 6300 : V6372 6400 : V6473 6500 : V6573
6600 : V6675 6700 : V6777 6800 : V6881 6900 : V6983 7000 : V7088
7100 : V7190 7200 : V7291 7300 : V7393 7400 : V7496 7500 : V7597
7600 : V7701 7700 : V7803 7800 : V7904 7900 : V8007 8000 : V8108
8100 : V8209 8200 : V8310 8300 : V8414 8400 : V8516 8500 : V8620
8600 : V8721 8700 : V8822 8800 : V8925 8900 : V9026 9000 : V9128
9100 : V9232 9200 : V9332 9300 : V9433 9400 : V9533 9500 : V9638
9600 : V9739 9700 : V9841 9800 : V9944




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : V100 200 : La_V201 300 : V302 400 : La_V402 500 : V504
600 : V606 700 : V707 800 : V807 900 : La_V907 1000 : La_V1008
1100 : La_V1108 1200 : V1209 1300 : La_V1309 1400 : La_V1409 1500 : La_V1509
1600 : La_V1610 1700 : La_V1710 1800 : La_V1810 1900 : La_V1911 2000 : La_V2012
2100 : La_V2113 2200 : La_V2213 2300 : V2313 2400 : La_V2417 2500 : La_V2518
2600 : V2620 2700 : La_V2722 2800 : V2822 2900 : V2922 3000 : La_V3023
3100 : La_V3123 3200 : La_V3223 3300 : V3326 3400 : V3428 3500 : La_V3528
3600 : La_V3629 3700 : La_V3734 3800 : La_V3835 3900 : La_V3935 4000 : La_V4038
4100 : La_V4140 4200 : La_V4243 4300 : V4344 4400 : V4445 4500 : La_V4547
4600 : La_V4649 4700 : La_V4751 4800 : V4853 4900 : La_V4954 5000 : La_V5055
5100 : V5156 5200 : La_V5256 5300 : V5360 5400 : La_V5462 5500 : La_V5564
5600 : La_V5666 5700 : La_V5768 5800 : La_V5868 5900 : La_V5970 6000 : La_V6070
6100 : La_V6171 6200 : La_V6271 6300 : La_V6372 6400 : La_V6473 6500 : V6573
6600 : La_V6675 6700 : La_V6777 6800 : V6881 6900 : La_V6983 7000 : La_V7088
7100 : La_V7190 7200 : V7291 7300 : La_V7393 7400 : La_V7496 7500 : La_V7597
7600 : La_V7701 7700 : La_V7803 7800 : La_V7904 7900 : V8007 8000 : La_V8108
8100 : La_V8209 8200 : La_V8310 8300 : La_V8414 8400 : La_V8516 8500 : V8620
8600 : La_V8721 8700 : La_V8822 8800 : La_V8925 8900 : V9026 9000 : V9128
9100 : La_V9232 9200 : La_V9332 9300 : La_V9433 9400 : V9533 9500 : La_V9638
9600 : La_V9739 9700 : La_V9841 9800 : V9944

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
V5005 314.7 72.9 239 83.6 0.18466 0.772
V4960 47.5 49.3 124 97.3 0.19534 0.751
V2309 43.1 45.3 113 89.0 0.18665 0.751
V8368 44.9 46.1 116 90.6 0.21091 0.751
V312 47.2 48.0 122 94.7 0.21678 0.750
V3365 46.3 46.9 119 92.5 0.22139 0.749
V9617 40.9 44.7 109 87.5 0.15591 0.749
V414 47.5 50.4 125 100.4 0.16265 0.749
V9092 33.9 63.1 124 132.6 0.00199 0.748
V1936 316.0 79.5 243 79.2 0.28495 0.748


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
La_V9413 -5.86 7.833 5.081 9.70 2.68e-05 0.816
La_V7908 24.72 17.813 3.389 19.31 1.34e-03 0.802
La_V316 -8.88 8.424 -0.250 9.09 1.78e-04 0.789
La_V6132 -2.98 4.591 6.202 11.86 1.18e-07 0.766
La_V1620 4.47 3.705 -0.125 5.92 3.93e-05 0.764
La_V2652 5.59 10.949 -3.707 11.11 2.47e-03 0.757
La_V8996 4.26 3.278 1.231 3.19 5.62e-02 0.751
La_V400 -10.15 9.172 -1.096 9.23 9.45e-04 0.746
La_V1163 -1.10 0.836 0.142 1.86 1.52e-10 0.743
La_V2536 -3.26 3.193 -0.180 4.24 3.08e-05 0.735

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
3.14 6619 0.672


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
La_V9413 -1.062V3945 + 1.000V9413 -5.86 7.833 5.081 9.70 2.68e-05 0.816 0.547 4
La_V7908 -2.243V2026 -1.042V4192 + 2.264V6826 + 1.000V7908 24.72 17.813 3.389 19.31 1.34e-03 0.802 0.729 -1
La_V316 + 1.000V316 -1.074V6388 -8.88 8.424 -0.250 9.09 1.78e-04 0.789 0.545 2
La_V6132 -0.933V845 + 1.000V6132 -2.98 4.591 6.202 11.86 1.18e-07 0.766 0.628 1
La_V1620 + 1.000V1620 -1.054V2371 4.47 3.705 -0.125 5.92 3.93e-05 0.764 0.537 1
La_V2652 + 1.383V298 + 1.000V2652 -1.561V5748 -0.899V9074 5.59 10.949 -3.707 11.11 2.47e-03 0.757 0.624 -1
La_V8996 -1.037V649 -0.372V2026 + 0.375V6826 + 1.334V8402 -0.281*V8996 4.26 3.278 1.231 3.19 5.62e-02 0.751 0.690 1
La_V400 + 1.000V400 -0.952V521 -10.15 9.172 -1.096 9.23 9.45e-04 0.746 0.545 2
La_V1163 + 1.000V1163 -0.490V4126 -0.516*V7512 -1.10 0.836 0.142 1.86 1.52e-10 0.743 0.549 -2
La_V2536 -1.206V1788 + 1.000V2536 + 0.203*V6444 -3.26 3.193 -0.180 4.24 3.08e-05 0.735 0.597 -2
V7908 NA 367.01 216.188 219.241 182.76 6.24e-05 0.729 0.729 NA
V4192 NA 347.32 194.538 222.893 183.71 4.84e-05 0.725 0.725 NA
V8996 NA 105.10 71.796 56.955 57.93 1.86e-03 0.690 0.690 NA
V8402 NA 101.39 70.064 60.286 58.00 1.79e-03 0.670 0.670 NA
V649 NA 100.98 66.569 63.554 57.64 1.73e-03 0.667 0.667 NA
V845 NA 154.08 120.432 102.446 119.50 3.78e-04 0.648 0.648 NA
V2026 NA 202.09 88.768 253.812 124.53 3.78e-03 0.634 0.634 NA
V6132 NA 140.75 111.236 101.768 114.76 7.54e-04 0.628 0.628 NA
V2652 NA 206.14 136.182 143.571 137.79 1.23e-03 0.624 0.624 NA
V6444 NA 136.08 113.563 91.643 103.79 5.84e-04 0.623 0.623 NA
V6826 NA 208.90 91.170 258.750 123.92 1.18e-02 0.620 0.620 NA
V9074 NA 203.03 153.722 148.054 150.51 1.62e-04 0.615 0.615 NA
V1788 NA 91.43 67.831 65.071 66.60 2.19e-03 0.613 0.613 NA
V2536 NA 79.44 57.450 59.732 59.90 6.15e-04 0.597 0.597 NA
V3945 NA 123.56 102.080 101.902 102.09 2.69e-03 0.571 0.571 NA
V6388 NA 110.74 105.732 82.929 87.26 2.87e-03 0.567 0.567 NA
V7512 NA 169.35 101.193 144.098 117.59 3.23e-04 0.560 0.560 NA
V5748 NA 100.36 75.725 85.402 77.29 8.04e-04 0.556 0.556 NA
V521 NA 174.02 170.382 160.955 165.07 1.51e-04 0.554 0.554 NA
V298 NA 100.22 87.035 86.116 84.21 3.54e-03 0.549 0.549 NA
V1163 NA 172.58 99.246 150.027 121.06 2.52e-04 0.549 0.549 NA
V9413 NA 125.39 109.470 113.321 109.31 3.60e-03 0.547 0.547 NA
V316 NA 110.00 112.162 88.777 96.85 1.45e-03 0.545 0.545 NA
V400 NA 155.44 163.730 152.062 156.57 1.12e-05 0.545 0.545 NA
V4126 NA 176.26 99.204 154.268 123.15 1.99e-04 0.538 0.538 NA
V1620 NA 34.19 26.851 31.830 33.88 1.01e-04 0.537 0.537 NA
V2371 NA 28.20 23.897 30.321 32.33 1.05e-04 0.490 0.490 NA

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,tol=0.002)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 109 3
1 13 75
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.3900 0.32199 0.4613
    tp 0.4400 0.37006 0.5117
    se 0.8523 0.76064 0.9189
    sp 0.9732 0.92371 0.9944
    diag.ac 0.9200 0.87334 0.9536
    diag.or 209.6154 57.73802 760.9996
    nndx 1.2114 1.09485 1.4612
    youden 0.8255 0.68435 0.9134
    pv.pos 0.9615 0.89169 0.9920
    pv.neg 0.8934 0.82468 0.9420
    lr.pos 31.8182 10.38462 97.4900
    lr.neg 0.1518 0.09181 0.2510
    p.rout 0.6100 0.53868 0.6780
    p.rin 0.3900 0.32199 0.4613
    p.tpdn 0.0268 0.00556 0.0763
    p.tndp 0.1477 0.08107 0.2394
    p.dntp 0.0385 0.00800 0.1083
    p.dptn 0.1066 0.05797 0.1753
  • tab:

      Outcome + Outcome - Total
    Test + 75 3 78
    Test - 13 109 122
    Total 88 112 200
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.920 0.873 0.954
3 se 0.852 0.761 0.919
4 sp 0.973 0.924 0.994
6 diag.or 209.615 57.738 761.000

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 106 6
1 10 78
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.4200 0.3507 0.492
    tp 0.4400 0.3701 0.512
    se 0.8864 0.8009 0.944
    sp 0.9464 0.8870 0.980
    diag.ac 0.9200 0.8733 0.954
    diag.or 137.8000 48.0526 395.168
    nndx 1.2008 1.0820 1.454
    youden 0.8328 0.6880 0.924
    pv.pos 0.9286 0.8510 0.973
    pv.neg 0.9138 0.8472 0.958
    lr.pos 16.5455 7.5693 36.166
    lr.neg 0.1201 0.0669 0.216
    p.rout 0.5800 0.5083 0.649
    p.rin 0.4200 0.3507 0.492
    p.tpdn 0.0536 0.0199 0.113
    p.tndp 0.1136 0.0559 0.199
    p.dntp 0.0714 0.0267 0.149
    p.dptn 0.0862 0.0421 0.153
  • tab:

      Outcome + Outcome - Total
    Test + 78 6 84
    Test - 10 106 116
    Total 88 112 200
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.920 0.873 0.954
3 se 0.886 0.801 0.944
4 sp 0.946 0.887 0.980
6 diag.or 137.800 48.053 395.168

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 96 16
1 16 72
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.440 0.3701 0.512
    tp 0.440 0.3701 0.512
    se 0.818 0.7216 0.892
    sp 0.857 0.7784 0.916
    diag.ac 0.840 0.7817 0.888
    diag.or 27.000 12.6607 57.580
    nndx 1.481 1.2370 2.000
    youden 0.675 0.5000 0.808
    pv.pos 0.818 0.7216 0.892
    pv.neg 0.857 0.7784 0.916
    lr.pos 5.727 3.6003 9.111
    lr.neg 0.212 0.1353 0.333
    p.rout 0.560 0.4883 0.630
    p.rin 0.440 0.3701 0.512
    p.tpdn 0.143 0.0839 0.222
    p.tndp 0.182 0.1076 0.278
    p.dntp 0.182 0.1076 0.278
    p.dptn 0.143 0.0839 0.222
  • tab:

      Outcome + Outcome - Total
    Test + 72 16 88
    Test - 16 96 112
    Total 88 112 200
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.840 0.782 0.888
3 se 0.818 0.722 0.892
4 sp 0.857 0.778 0.916
6 diag.or 27.000 12.661 57.580


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 109 3
1 13 75
  pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.3900 0.32199 0.4613
    tp 0.4400 0.37006 0.5117
    se 0.8523 0.76064 0.9189
    sp 0.9732 0.92371 0.9944
    diag.ac 0.9200 0.87334 0.9536
    diag.or 209.6154 57.73802 760.9996
    nndx 1.2114 1.09485 1.4612
    youden 0.8255 0.68435 0.9134
    pv.pos 0.9615 0.89169 0.9920
    pv.neg 0.8934 0.82468 0.9420
    lr.pos 31.8182 10.38462 97.4900
    lr.neg 0.1518 0.09181 0.2510
    p.rout 0.6100 0.53868 0.6780
    p.rin 0.3900 0.32199 0.4613
    p.tpdn 0.0268 0.00556 0.0763
    p.tndp 0.1477 0.08107 0.2394
    p.dntp 0.0385 0.00800 0.1083
    p.dptn 0.1066 0.05797 0.1753
  • tab:

      Outcome + Outcome - Total
    Test + 75 3 78
    Test - 13 109 122
    Total 88 112 200
  • method: exact

  • digits: 2

  • conf.level: 0.95

  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.920 0.873 0.954
3 se 0.852 0.761 0.919
4 sp 0.973 0.924 0.994
6 diag.or 209.615 57.738 761.000
  par(op)